/*
 * Fixed-width integer aliases used throughout this file.
 * MSVC spells the 64-bit types __int64; other compilers use long long.
 */
typedef unsigned char u8;
typedef unsigned short u16;
typedef unsigned int u32;

#ifdef _MSC_VER
typedef unsigned __int64 u64;
#else
typedef unsigned long long u64;
#endif

typedef signed char s8;
typedef signed short s16;
typedef signed int s32;

#ifdef _MSC_VER
typedef signed __int64 s64;
#else
typedef signed long long s64;
#endif

/*
 * Two scratch rows of expanded pixels, 4 bytes per pixel, 400 pixels each.
 * fill_rgb_row_16 writes into them without any bounds check.
 */
static u8 row_cur[4 * 400] = {0};
static u8 row_next[4 * 400] = {0};

/*
 * Pointers to the scratch rows. The scalers swap these two pointers after
 * every source row instead of copying row data.
 */
static u8 *rgb_row_cur = row_cur;
static u8 *rgb_row_next = row_next;

/* Selects the first of the two blend sequences inside BilinearPlus. */
#define USE_ORIGINAL_BILINEAR_PLUS

/*
 * Masks applied by fill_rgb_row_16 to four packed 16-bit pixels at a time
 * (one 16-bit lane per pixel).
 *
 * These are unsigned 64-bit values: several have bit 63 set, which does not
 * fit a signed 64-bit type. They are only ever loaded with movq, so only the
 * size (8 bytes) and the bit pattern matter.
 */
static const u64 redMask	= 0xF800F800F800F800;
static const u64 greenMask	= 0x00FC00FC00FC00FC;
static const u64 blueMask	= 0x00F800F800F800F8;

/*
 * Masks applied by Bilinear / BilinearPlus when repacking two expanded
 * pixels (one per 32-bit lane) into 16-bit 5:6:5 fields.
 */
static const u64 redMask2	= 0x0000F8000000F800;
static const u64 greenMask2	= 0x000007E0000007E0;
static const u64 blueMask2	= 0x0000001F0000001F;

/*
 * fill_rgb_row_16 - expand one row of 16-bit pixels to 4 bytes per pixel.
 *
 * The function is naked: the compiler emits no prologue or epilogue, so the
 * __asm body below is the entire routine.
 *
 * Arguments
 *   With ROW16_STACK defined they are read from the stack (cdecl):
 *     from, src_width, row, width.
 *   Otherwise they are expected in registers:
 *     eax = source row, ecx = src_width, edi = destination row, ebx = width.
 *
 * Output layout
 *   Each source pixel becomes one dword whose bytes are, in memory order,
 *   blue (bits 3-7 used), green (bits 2-7 used), red (bits 3-7 used), 0.
 *
 * After the conversion loop, if width > src_width the last dword written is
 * replicated (width - src_width) times as right-hand padding.
 *
 * Registers written: eax, ecx, edi, mm0-mm2, mm4-mm7, and edx in the
 * register-argument variant.
 *
 * NOTE(review): the loop is a do-while over src_width / 4 groups of four
 * pixels. A src_width below 4 gives a count of 0, which the first `dec`
 * wraps around; a src_width that is not a multiple of 4 leaves the trailing
 * pixels unconverted. No check is made against the size of the destination.
 * NOTE(review): the ROW16_STACK variant overwrites edi without restoring it.
 */
#ifdef ROW16_STACK
static __declspec( naked ) void __cdecl fill_rgb_row_16(u16 *from, int src_width, u8 *row, int width)
#else
static __declspec( naked ) void __cdecl fill_rgb_row_16(/*u16 *from, int src_width, u8 *row, int width*/)
#endif
{
#ifdef ROW16_STACK
/* Stack slots of the cdecl arguments; [esp] itself holds the return address. */
#define from		dword ptr [esp + 0x04]
#define src_width	dword ptr [esp + 0x08]
#define row			dword ptr [esp + 0x0C]
#define width		dword ptr [esp + 0x10]
#endif
	__asm {
#ifdef ROW16_STACK
		mov ecx, src_width;
		mov eax, from;
		mov edi, row;
#else
		// Keep the original src_width; ecx is about to become the loop count.
		mov edx, ecx;
#endif
		movq mm5, redMask;
		movq mm6, greenMask;
		movq mm7, blueMask;
		// Four pixels are converted per iteration.
		shr ecx, 2;
		align 4;
row16label0:
		// Load four 16-bit pixels.
		movq mm4, qword ptr [eax];
		movq mm2, mm4;
		movq mm0, mm4;
		// mm0: red kept in place (high byte of each 16-bit lane).
		// mm2: blue moved up to bits 3-7 of the low byte.
		// mm4: green moved down to bits 2-7 of the low byte.
		// The shifts are 64-bit wide; the masks discard bits that crossed lanes.
		psllq mm2, 3;
		pand mm0, mm5;
		psrlq mm4, 3;
		pand mm2, mm7;
		pand mm4, mm6;
		// mm0 lanes now hold red:blue (high byte : low byte).
		por mm0, mm2;
		movq mm1, mm0;
		add eax, 8;
		// Interleave with the green lanes: bytes become B, G, R, 0 per pixel.
		punpcklbw mm0, mm4;
		punpckhbw mm1, mm4;
		movq qword ptr [edi], mm0;
		movq qword ptr [edi + 8], mm1;

		add edi, 16;
		dec ecx;
		jnz row16label0;

		// Padding: ecx = width - src_width, eax = last dword written.
#ifdef ROW16_STACK
		mov ecx, width;
#else
		mov ecx, ebx;
#endif
		mov eax, dword ptr [edi - 4];
#ifdef ROW16_STACK
		sub ecx, src_width;
#else
		sub ecx, edx;
#endif
		// Nothing to pad when width <= src_width.
		jle row16_end;
		rep stosd;
row16_end:
		ret;
	}
#ifdef ROW16_STACK
#undef from
#undef src_width
#undef row
#undef width
#endif
}

/*
 * Bilinear - 2x upscale of a 16-bit (5:6:5) image with bilinear blending.
 *
 * srcPtr / srcPitch : first source row and the byte distance between rows.
 * (third parameter) : unused.
 * dstPtr / dstPitch : first destination row and the byte distance between rows.
 * width, height     : source size in pixels / rows.
 *
 * For every source pixel A, with right neighbour B, lower neighbour C and
 * lower-right neighbour D, a 2x2 block is written:
 *     A            (A + B) / 2
 *     (A + C) / 2  (A + B + C + D) / 4
 * Two destination rows are produced per source row.
 *
 * Each source row is first expanded by fill_rgb_row_16 into the file-scope
 * scratch rows (rgb_row_cur / rgb_row_next), padded to width + 2 pixels so
 * the x loop can read one pixel past the end of the row.
 *
 * NOTE(review): MMX state is left active on return (the emms at the end is
 * commented out); presumably the caller issues emms - confirm.
 * NOTE(review): width must satisfy the constraints of fill_rgb_row_16 and of
 * the scratch-row size; nothing is validated here.
 */
void Bilinear(u8 *srcPtr, u32 srcPitch, u8 * /* deltaPtr */,
			  u8 *dstPtr, u32 dstPitch, int width, int height)
{
	__asm {
		// Expand source row 0 into rgb_row_cur.
		// Register arguments: eax = source, ecx = width, edi = row, ebx = width + 2.
		mov		ebx, width;
		mov		ecx, width;
		add		ebx, 2;
		mov		eax, srcPtr;
		mov		edi, rgb_row_cur;
#ifdef ROW16_STACK
		push	ebx;
		push	edi;
		push	ecx;
		push	eax;
#endif
		call	fill_rgb_row_16;
#ifdef ROW16_STACK
		add		esp, 16;
#endif
		// height is pre-decremented; the y loop runs while it stays >= 0.
		dec		height;
		align 4;

bilinear_y_loop:
		// Expand the row below the current one into rgb_row_next.
		mov		eax, srcPtr;
		mov		ebx, width;
		add		eax, srcPitch;
		mov		ecx, width
		add		ebx, 2;
		mov		edi, rgb_row_next;
#ifdef ROW16_STACK
		push	ebx;
		push	edi;
		push	ecx;
		push	eax;
#endif
		call	fill_rgb_row_16;
#ifdef ROW16_STACK
		add		esp, 16;
#endif
		// ebp is reused as the second destination pointer, so it is saved here.
		// No C variable is referenced by name until it is restored.
		// [esp] = x-loop counter (starts at width), [esp + 4] = saved ebp.
		push	ebp;
		push	width;
		mov		esi, rgb_row_cur;		// current expanded row
		mov		edi, rgb_row_next;		// next expanded row
		mov		ebx, dstPtr;			// upper destination row
		mov		ebp, dstPitch;
		add		ebp, ebx;				// lower destination row
		movq	mm5, blueMask2;
		movq	mm6, greenMask2;
		align 4;

bilinear_x_loop:
		// Register contents are shown as [low dword, high dword].
		// psrlq shifts the whole 64-bit register; it is used here as a
		// per-channel halving (presumably relying on the cleared low bits of
		// the expanded channels), while paddb adds byte by byte.
		movd	mm0, dword ptr [esi];	// [A, 0]
		movq	mm1, qword ptr [esi];	// [A, B]
		psrlq	mm0, 1;					// [A/2, 0]
		movq	mm3, qword ptr [edi];	// [C, D]
		psrlq	mm1, 1;					// [A/2, B/2]
		movq	mm7, mm0;
		movq	mm2, mm1;
		psllq	mm0, 32;				// [0, A/2]
		psrlq	mm3, 1;					// [C/2, D/2]
		por		mm0, mm7;				// [A/2, A/2]
		paddb	mm2, mm3;				// [(A+C)/2, (B+D)/2]
		paddb	mm0, mm1;				// upper pair: [A, (A+B)/2]
		psrlq	mm2, 1;					// [(A+C)/4, (B+D)/4]
		movq	mm1, mm0;
		movq	mm7, mm2;
		movd	eax, mm7;				// eax = (A+C)/4
		psllq	mm7, 32;
		movd	mm3, eax;
		movq	mm4, mm0;
		por		mm3, mm7;				// [(A+C)/4, (A+C)/4]
		paddb	mm2, mm3;				// lower pair: [(A+C)/2, (A+B+C+D)/4]

		// Repack both pairs to 5:6:5. Upper pair: mm0 = red, mm1 = green,
		// mm4 = blue. Lower pair: mm1 = red, mm2 = green, mm3 = blue.
		// The two sequences are interleaved.
		psrlq	mm0, 8;
		movq	mm7, redMask2;
		psrlq	mm1, 5;
		pand	mm0, mm7;
		psrlq	mm4, 3;
		pand	mm1, mm6;
		movq	mm3, mm2;
		por		mm0, mm1;
		pand	mm4, mm5;
		psrlq	mm3, 3;
		movq	mm1, mm2;
		pand	mm3, mm5;
		psrlq	mm2, 5;
		por		mm0, mm4;				// upper pair packed, one pixel per dword
		psrlq	mm1, 8;
		pand	mm2, mm6;
		// Words 0 and 2 hold the two packed pixels; merge them into one dword.
		pextrw	ecx, mm0, 2;
		por		mm3, mm2;
		pand	mm1, mm7;
		shl		ecx, 16;
		por		mm1, mm3;				// lower pair packed
		pextrw	eax, mm0, 0;
		or		eax, ecx;
		pextrw	ecx, mm1, 2;
		mov		dword ptr [ebx], eax;	// two pixels to the upper row
		pextrw	edx, mm1, 0;
		shl		ecx, 16;
		add		esi, 4;
		or		edx, ecx;
		add		edi, 4;
		mov		dword ptr [ebp], edx;	// two pixels to the lower row
		add		ebx, 4;
		add		ebp, 4;

		// dec does not modify CF, so branch on ZF alone rather than on a
		// condition that also reads the carry left by the add above.
		dec		dword ptr [esp];
		jnz		bilinear_x_loop;
		// ebx receives the exhausted counter (0); the cmovne below relies on it.
		pop		ebx;
		pop		ebp;
		// Swap the scratch rows: the row just used as "next" becomes "current".
		mov		eax, rgb_row_next;
		mov		edx, dstPitch;
		xchg	eax, rgb_row_cur;
		shl		edx, 1;					// two destination rows per source row
		dec		height;
		mov		rgb_row_next, eax;
		// Save the flags of `dec height`; the adds below overwrite them.
		lahf;
		// Advance srcPtr by one row unless height just reached 0. In that case
		// ebx stays 0 and the final pass re-reads the same source row as its
		// "next" row.
		cmovne	ebx, srcPitch;
		add		dstPtr, edx;
		add		srcPtr, ebx;
		sahf;

		// Loop while the decremented height is still non-negative.
		jns		bilinear_y_loop;
		// NOTE(review): emms is disabled, so MMX state is still active on return.
		//emms;
	}
}

/*
 * BilinearPlus - 2x upscale of a 16-bit (5:6:5) image, like Bilinear but with
 * a different blend for the generated 2x2 block.
 *
 * srcPtr / srcPitch : first source row and the byte distance between rows.
 * (third parameter) : unused.
 * dstPtr / dstPitch : first destination row and the byte distance between rows.
 * width, height     : source size in pixels / rows.
 *
 * Each source row is first expanded by fill_rgb_row_16 into the file-scope
 * scratch rows (rgb_row_cur / rgb_row_next), padded to width + 2 pixels so
 * the x loop can read one pixel past the end of the row. Two destination
 * rows are produced per source row.
 *
 * Two blend sequences exist; USE_ORIGINAL_BILINEAR_PLUS selects the first.
 *
 * NOTE(review): MMX state is left active on return (the emms at the end is
 * commented out); presumably the caller issues emms - confirm.
 * NOTE(review): width must satisfy the constraints of fill_rgb_row_16 and of
 * the scratch-row size; nothing is validated here.
 */
void BilinearPlus(u8 *srcPtr, u32 srcPitch, u8 * /* deltaPtr */,
				  u8 *dstPtr, u32 dstPitch, int width, int height)
{
	__asm {
		// Expand source row 0 into rgb_row_cur.
		// Register arguments: eax = source, ecx = width, edi = row, ebx = width + 2.
		mov		ebx, width;
		mov		ecx, width;
		add		ebx, 2;
		mov		eax, srcPtr;
		mov		edi, rgb_row_cur;
#ifdef ROW16_STACK
		push	ebx;
		push	edi;
		push	ecx;
		push	eax;
#endif
		call	fill_rgb_row_16;
#ifdef ROW16_STACK
		add		esp, 16;
#endif
		// height is pre-decremented; the y loop runs while it stays >= 0.
		dec		height;
		align 4;

bilinearP_y_loop:
		// Expand the row below the current one into rgb_row_next.
		mov		eax, srcPtr;
		mov		ebx, width;
		add		eax, srcPitch;
		mov		ecx, width
		add		ebx, 2;
		mov		edi, rgb_row_next;
#ifdef ROW16_STACK
		push	ebx;
		push	edi;
		push	ecx;
		push	eax;
#endif
		call	fill_rgb_row_16;
#ifdef ROW16_STACK
		add		esp, 16;
#endif
		// ebp is reused as the second destination pointer, so it is saved here.
		// No C variable is referenced by name until it is restored.
		// [esp] = x-loop counter (starts at width), [esp + 4] = saved ebp.
		push	ebp;
		push	width;
		mov		esi, rgb_row_cur;		// current expanded row
		mov		edi, rgb_row_next;		// next expanded row
		mov		ebx, dstPtr;			// upper destination row
		mov		ebp, dstPitch;
		add		ebp, ebx;				// lower destination row
		align 4;

bilinearP_x_loop:
		// A/B = current-row pixels i and i+1, C/D = next-row pixels i and i+1.
		// Both sequences leave the upper output pair in mm0 (copied to mm1 and
		// mm4) and the lower output pair in mm2.
#ifdef USE_ORIGINAL_BILINEAR_PLUS
		// NOTE(review): traced by hand, treating each 64-bit shift as a
		// per-channel divide, the outputs appear to be
		//   upper-left ~ 5/8 A + 2/8 B + 1/8 C    upper-right ~ (A + B) / 2
		//   lower-left ~ (C + D) / 2              lower-right ~ (A + B + C + D) / 4
		// Confirm before relying on the exact weights.
		movd	mm0, dword ptr [esi];	// A
		movd	mm4, dword ptr [esi + 4];	// B
		psrlq	mm0, 3;
		movq	mm2, mm4;
		movq	mm1, mm0;
		// movd transfers 32 bits; the operand is spelled "qword ptr" in the source.
		movd	mm5, qword ptr [edi + 4];	// D
		psllq	mm0, 2;
		movd	mm3, dword ptr [edi];	// C
		psrlq	mm4, 1;
		movq	mm6, mm3;
		paddb	mm4, mm0;
		psrlq	mm2, 2;
		paddb	mm0, mm1;
		psrlq	mm3, 3;
		movq	mm7, mm6;
		paddb	mm2, mm3;
		psllq	mm4, 32;				// move the upper-right value to the high dword
		paddb	mm0, mm2;
		psllq	mm6, 30;
		movq	mm1, mm4;
		psrlq	mm5, 1;
		por		mm0, mm4;				// upper pair complete
		psrlq	mm1, 1;
		movq	mm2, mm5;
		psrlq	mm7, 1;
		psllq	mm5, 31;
		paddb	mm1, mm6;
		movq	mm4, mm0;
		paddb	mm5, mm1;
		paddb	mm2, mm7;
		movq	mm1, mm0;
		por		mm2, mm5;				// lower pair complete
#else
		// Alternative weighting, compiled only when
		// USE_ORIGINAL_BILINEAR_PLUS is not defined.
		movd	mm0, dword ptr [esi];	// A
		movd	mm1, dword ptr [esi + 4];	// B
		movq	mm4, mm0;
		movd	mm3, dword ptr [edi];	// C
		psrlq	mm0, 1;
		movq	mm6, mm3;
		psrlq	mm1, 3;
		movq	mm7, mm3;
		psrlq	mm4, 3;
		movq	mm5, mm0;
		psrlq	mm3, 3;
		paddb	mm0, mm1;
		psllq	mm1, 34;
		paddb	mm0, mm3;
		psllq	mm5, 32;
		paddb	mm0, mm4;
		psllq	mm6, 30;
		paddb	mm1, mm5;
		por		mm0, mm1;				// upper pair complete
		// movd transfers 32 bits; the operand is spelled "qword ptr" in the source.
		movd	mm5, qword ptr [edi + 4];	// D
		psrlq	mm1, 1;
		movq	mm2, mm5;
		psrlq	mm7, 1;
		psllq	mm5, 30;
		paddb	mm1, mm6;
		psrlq	mm2, 1;
		movq	mm4, mm0;
		paddb	mm5, mm1;
		paddb	mm2, mm7;
		movq	mm1, mm0;
		por		mm2, mm5;				// lower pair complete
#endif
		// Repack both pairs to 5:6:5. mm5 and mm6 were used as temporaries
		// above, so the masks are reloaded on every iteration.
		// Upper pair: mm0 = red, mm1 = green, mm4 = blue.
		// Lower pair: mm1 = red, mm2 = green, mm3 = blue.
		movq	mm6, greenMask2;
		movq	mm5, blueMask2;
		psrlq	mm0, 8;
		movq	mm7, redMask2;
		psrlq	mm1, 5;
		pand	mm0, mm7;
		psrlq	mm4, 3;
		pand	mm1, mm6;
		movq	mm3, mm2;
		por		mm0, mm1;
		pand	mm4, mm5;
		psrlq	mm3, 3;
		movq	mm1, mm2;
		pand	mm3, mm5;
		psrlq	mm2, 5;
		por		mm0, mm4;				// upper pair packed, one pixel per dword
		psrlq	mm1, 8;
		pand	mm2, mm6;
		// Words 0 and 2 hold the two packed pixels; merge them into one dword.
		pextrw	ecx, mm0, 2;
		por		mm3, mm2;
		pand	mm1, mm7;
		shl		ecx, 16;
		por		mm1, mm3;				// lower pair packed
		pextrw	eax, mm0, 0;
		or		eax, ecx;
		pextrw	ecx, mm1, 2;
		mov		dword ptr [ebx], eax;	// two pixels to the upper row
		pextrw	edx, mm1, 0;
		shl		ecx, 16;
		add		esi, 4;
		or		edx, ecx;
		add		edi, 4;
		mov		dword ptr [ebp], edx;	// two pixels to the lower row
		add		ebx, 4;
		add		ebp, 4;

		// dec does not modify CF, so branch on ZF alone rather than on a
		// condition that also reads the carry left by the add above.
		dec		dword ptr [esp];
		jnz		bilinearP_x_loop;
		// ebx receives the exhausted counter (0); the cmovne below relies on it.
		pop		ebx;
		pop		ebp;
		// Swap the scratch rows: the row just used as "next" becomes "current".
		mov		eax, rgb_row_next;
		mov		edx, dstPitch;
		xchg	eax, rgb_row_cur;
		shl		edx, 1;					// two destination rows per source row
		dec		height;
		mov		rgb_row_next, eax;
		// Save the flags of `dec height`; the adds below overwrite them.
		lahf;
		// Advance srcPtr by one row unless height just reached 0. In that case
		// ebx stays 0 and the final pass re-reads the same source row as its
		// "next" row.
		cmovne	ebx, srcPitch;
		add		dstPtr, edx;
		add		srcPtr, ebx;
		sahf;

		// Loop while the decremented height is still non-negative.
		jns		bilinearP_y_loop;
		// NOTE(review): emms is disabled, so MMX state is still active on return.
		//emms;
	}
}
